Initialise the libs


In [326]:
import numpy as np
import pandas as pa
import matplotlib.pyplot as plt
from sklearn import linear_model

Load the data


In [327]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int,
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float,
              'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int,
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
regressionDir = '/media/weenkus/The Beast/Programming/Workspace/Projects/Machine-Learning-University-of-Washington/'

In [328]:
house_train = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_train_data.csv', dtype = dtype_dict)
house_valid = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_valid_data.csv', dtype = dtype_dict)
house_test = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_test_data.csv', dtype = dtype_dict)
house_set1 = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_set_1_data.csv', dtype = dtype_dict)
house_set2 = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_set_2_data.csv', dtype = dtype_dict)
house_set3 = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_set_3_data.csv', dtype = dtype_dict)
house_set4 = pa.read_csv(regressionDir + 'Regression/datasets/wk3_kc_house_set_4_data.csv', dtype = dtype_dict)
sales = pa.read_csv(regressionDir + 'Regression/datasets/kc_house_data.csv', dtype = dtype_dict)

Exploring the data


In [329]:
house_train.head()


Out[329]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 2487200875 20141209T000000 604000 4 3.00 1960 5000 1 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
1 7237550310 20140512T000000 1225000 4 4.50 5420 101930 1 0 0 ... 11 3890 1530 2001 0 98053 47.6561 -122.005 4760 101930
2 9212900260 20140527T000000 468000 2 1.00 1160 6000 1 0 0 ... 7 860 300 1942 0 98115 47.6900 -122.292 1330 6000
3 0114101516 20140528T000000 310000 3 1.00 1430 19901 1.5 0 0 ... 7 1430 0 1927 0 98028 47.7558 -122.229 1780 12697
4 6054650070 20141007T000000 400000 3 1.75 1370 9680 1 0 0 ... 7 1370 0 1977 0 98074 47.6127 -122.045 1370 10208

5 rows × 21 columns


In [330]:
# Show plots in jupyter
%matplotlib inline

plt.scatter(house_train.price, house_train.bedrooms, alpha=0.5)
plt.ylabel('price')
plt.xlabel('bedrooms')
plt.show()



In [331]:
plt.scatter(house_train.price, house_train.sqft_living, alpha=0.5)
plt.ylabel('price')
plt.xlabel('sqft_living')
plt.show()



In [332]:
plt.scatter(house_train.price, house_train.zipcode, alpha=0.5)
plt.ylabel('price')
plt.xlabel('zipcode')
plt.show()


Function for adding new features


In [333]:
def polynomial_dataframe(feature, degree): # feature is pandas.Series type
    # assume that degree >= 1
    # initialize the dataframe:
    poly_dataframe = pa.DataFrame()
    # and set poly_dataframe['power_1'] equal to the passed feature
    poly_dataframe['power_1'] = feature

    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees:
        for power in range(2, degree+1):
            # first we'll give the column a name:
            name = 'power_' + str(power)
            # assign poly_dataframe[name] to be feature^power; use apply(*)
            poly_dataframe[name] = feature;
            poly_dataframe[name] = poly_dataframe[name].apply(lambda x: x**power)
    return poly_dataframe

In [334]:
sales = sales.sort(['sqft_living','price'])
sales.head()


/home/weenkus/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
Out[334]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
19452 3980300371 20140926T000000 142000 0 0.00 290 20875 1 0 0 ... 1 290 0 1963 0 98024 47.5308 -121.888 1620 22850
15381 2856101479 20140701T000000 276000 1 0.75 370 1801 1 0 0 ... 5 370 0 1923 0 98117 47.6778 -122.389 1340 5000
860 1723049033 20140620T000000 245000 1 0.75 380 15000 1 0 0 ... 5 380 0 1963 0 98168 47.4810 -122.323 1170 15000
18379 1222029077 20141029T000000 265000 0 0.75 384 213444 1 0 0 ... 4 384 0 2003 0 98070 47.4177 -122.491 1920 224341
4868 6896300380 20141002T000000 228000 0 1.00 390 5900 1 0 0 ... 4 390 0 1953 0 98118 47.5260 -122.261 2170 6000

5 rows × 21 columns

Add new features with the new function


In [335]:
poly1_data = polynomial_dataframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price']

In [336]:
poly1_data.head()


Out[336]:
power_1 price
19452 290 142000
15381 370 276000
860 380 245000
18379 384 265000
4868 390 228000

Create a regression model


In [337]:
model1 = linear_model.LinearRegression()
model1.fit(poly1_data[['power_1']], poly1_data['price'])


Out[337]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [338]:
plt.plot(poly1_data.power_1,poly1_data.price, '.',
poly1_data[['power_1']], model1.predict(poly1_data[['power_1']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft')


Out[338]:
<matplotlib.text.Text at 0x7f04c9647470>

Create a regression model for polynomials 2 and 3


In [339]:
poly3_data = polynomial_dataframe(sales['sqft_living'], 3)
poly3_data['price'] = sales['price']

In [340]:
poly3_data.head() # third polynomial


Out[340]:
power_1 power_2 power_3 price
19452 290 84100 24389000 142000
15381 370 136900 50653000 276000
860 380 144400 54872000 245000
18379 384 147456 56623104 265000
4868 390 152100 59319000 228000

In [341]:
model2 = linear_model.LinearRegression()
model2.fit(poly3_data[['power_2']], poly3_data['price'])

model3 = linear_model.LinearRegression()
model3.fit(poly3_data[['power_3']], poly3_data['price'])


Out[341]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Regression model with a second degree polynomial


In [342]:
plt.plot(poly3_data[['power_2']], model2.predict(poly3_data[['price']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 2')


Out[342]:
<matplotlib.text.Text at 0x7f04c9547518>

In [343]:
print ('Model2: ', model2.coef_)


Model2:  [ 0.04939429]

Regression model with a third degree polynomial


In [344]:
plt.plot(poly3_data[['power_3']], model3.predict(poly3_data[['price']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 3')


Out[344]:
<matplotlib.text.Text at 0x7f04c94989b0>

In [345]:
print ('Model3: ', model3.coef_)


Model3:  [  6.29505518e-06]

Estimate the 15th degree wiht the four house sets


In [346]:
poly15_set1_data = polynomial_dataframe(house_set1['sqft_living'], 15)
poly15_set1_data['price'] = house_set1['price']

poly15_set2_data = polynomial_dataframe(house_set2['sqft_living'], 15)
poly15_set2_data['price'] = house_set2['price']

poly15_set3_data = polynomial_dataframe(house_set3['sqft_living'], 15)
poly15_set3_data['price'] = house_set3['price']

poly15_set4_data = polynomial_dataframe(house_set4['sqft_living'], 15)
poly15_set4_data['price'] = house_set4['price']

In [347]:
poly15_set1_data.head()


Out[347]:
power_1 power_2 power_3 power_4 power_5 power_6 power_7 power_8 power_9 power_10 power_11 power_12 power_13 power_14 power_15 price
0 430 184900 79507000 34188010000 1.470084e+13 6.321363e+15 2.718186e+18 1.168820e+21 5.025926e+23 2.161148e+26 9.292937e+28 3.995963e+31 1.718264e+34 7.388536e+36 3.177070e+39 80000
1 460 211600 97336000 44774560000 2.059630e+13 9.474297e+15 4.358177e+18 2.004761e+21 9.221902e+23 4.242075e+26 1.951354e+29 8.976230e+31 4.129066e+34 1.899370e+37 8.737103e+39 247000
2 470 220900 103823000 48796810000 2.293450e+13 1.077922e+16 5.066231e+18 2.381129e+21 1.119130e+24 5.259913e+26 2.472159e+29 1.161915e+32 5.461000e+34 2.566670e+37 1.206335e+40 192500
3 490 240100 117649000 57648010000 2.824752e+13 1.384129e+16 6.782231e+18 3.323293e+21 1.628414e+24 7.979227e+26 3.909821e+29 1.915812e+32 9.387480e+34 4.599865e+37 2.253934e+40 150000
4 500 250000 125000000 62500000000 3.125000e+13 1.562500e+16 7.812500e+18 3.906250e+21 1.953125e+24 9.765625e+26 4.882812e+29 2.441406e+32 1.220703e+35 6.103516e+37 3.051758e+40 125000

In [348]:
model_poly15_set1 = linear_model.LinearRegression()
model_poly15_set1.fit(poly15_set1_data[['power_15']], poly15_set1_data['price'])

model_poly15_set2 = linear_model.LinearRegression()
model_poly15_set2.fit(poly15_set2_data[['power_15']], poly15_set2_data['price'])

model_poly15_set3 = linear_model.LinearRegression()
model_poly15_set3.fit(poly15_set3_data[['power_15']], poly15_set3_data['price'])

model_poly15_set4 = linear_model.LinearRegression()
model_poly15_set4.fit(poly15_set4_data[['power_15']], poly15_set4_data['price'])


Out[348]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

House set 1 regression model for 15th degree polynomial


In [349]:
plt.plot(poly15_set1_data.power_15 ,poly15_set1_data.price, '.',
poly15_set1_data[['power_15']], model_poly15_set1.predict(poly15_set1_data[['price']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')


Out[349]:
<matplotlib.text.Text at 0x7f04c950ebe0>

In [350]:
#pint ('Model for set1: ', model_poly15_set1.coef_)

House set 2 regression model for 15th degree polynomial


In [351]:
plt.plot(poly15_set2_data.power_15 ,poly15_set2_data.price, '.',
poly15_set2_data[['power_15']], model_poly15_set2.predict(poly15_set2_data[['price']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')


Out[351]:
<matplotlib.text.Text at 0x7f04c94f19b0>

In [352]:
print ('Model for set12: ', model_poly15_set2.coef_)


Model for set12:  [  1.86753194e-53]

House set 3 regression model for 15th degree polynomial


In [353]:
plt.plot(poly15_set3_data.power_15 ,poly15_set3_data.price, '.',
poly15_set3_data[['power_15']], model_poly15_set3.predict(poly15_set3_data[['price']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')


Out[353]:
<matplotlib.text.Text at 0x7f04c9579cf8>

In [354]:
print ('Model for set3: ', model_poly15_set3.coef_)


Model for set3:  [  6.78755526e-54]

House set 4 regression model for 15th degree polynomial


In [355]:
plt.plot(poly15_set4_data.power_15 ,poly15_set4_data.price, '.',
poly15_set4_data[['power_15']], model_poly15_set4.predict(poly15_set4_data[['price']]),'-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')


Out[355]:
<matplotlib.text.Text at 0x7f04c9431358>

In [356]:
print ('Model for set4: ', model_poly15_set4.coef_)


Model for set4:  [  1.19721477e-52]

Finding the optimal polynomial degree


In [357]:
# Engineering the test set
poly_data_test = polynomial_dataframe(house_test['sqft_living'], 15)
poly_data_test['price'] = house_test['price']
poly_data_test.head()

# Engineering the validation set
poly_data_validation = polynomial_dataframe(house_valid['sqft_living'], 15)
poly_data_validation['price'] = house_valid['price']
poly_data_validation.head()


Out[357]:
power_1 power_2 power_3 power_4 power_5 power_6 power_7 power_8 power_9 power_10 power_11 power_12 power_13 power_14 power_15 price
0 1180 1392400 1643032000 1.938778e+12 2.287758e+15 2.699554e+18 3.185474e+21 3.758859e+24 4.435454e+27 5.233836e+30 6.175926e+33 7.287593e+36 8.599359e+39 1.014724e+43 1.197375e+46 221900
1 2570 6604900 16974593000 4.362470e+13 1.121155e+17 2.881368e+20 7.405116e+23 1.903115e+27 4.891005e+30 1.256988e+34 3.230460e+37 8.302282e+40 2.133686e+44 5.483574e+47 1.409279e+51 538000
2 770 592900 456533000 3.515304e+11 2.706784e+14 2.084224e+17 1.604852e+20 1.235736e+23 9.515169e+25 7.326680e+28 5.641544e+31 4.343989e+34 3.344871e+37 2.575551e+40 1.983174e+43 180000
3 1680 2822400 4741632000 7.965942e+12 1.338278e+16 2.248307e+19 3.777156e+22 6.345623e+25 1.066065e+29 1.790989e+32 3.008861e+35 5.054886e+38 8.492209e+41 1.426691e+45 2.396841e+48 510000
4 1715 2941225 5044200875 8.650805e+12 1.483613e+16 2.544396e+19 4.363640e+22 7.483642e+25 1.283445e+29 2.201107e+32 3.774899e+35 6.473952e+38 1.110283e+42 1.904135e+45 3.265592e+48 257500

In [358]:
import sys

index = ['power_1','power_2','power_3','power_4','power_5','power_6','power_7','power_8','power_9',
        'power_10','power_11','power_12','power_13','power_14','power_15']

for power in range(1, 16):
    name = 'power_' + str(power)
    
    # Build a data set
    poly_data_training = polynomial_dataframe(house_train['sqft_living'], power)
    poly_data_training['price'] = house_train['price']
    
    # Build a model and fit it using the training data
    model = linear_model.LinearRegression()
    model.fit(poly_data_training[index[0:power]],  poly_data_training['price'])
    
    # Compute the RSS on the test set
    RSS = ((model.predict(poly_data_validation[index[0:power]]) - poly_data_validation.price) ** 2).sum()
    print('The RSS for ', power,'th degree polynomial: ', RSS)
    
    if(power == 1):
        min = RSS
    
    # Save the min RSS
    if(RSS < min):
        min = RSS
        minPower = power
        optimalModel = model


The RSS for  1 th degree polynomial:  629097886299587.5
The RSS for  2 th degree polynomial:  623955062706519.1
The RSS for  3 th degree polynomial:  625820280251530.6
The RSS for  4 th degree polynomial:  629987341468499.8
The RSS for  5 th degree polynomial:  628240679314405.9
The RSS for  6 th degree polynomial:  566268593930554.5
The RSS for  7 th degree polynomial:  1073845517537453.6
The RSS for  8 th degree polynomial:  7087872270340538.0
The RSS for  9 th degree polynomial:  4.530360160665216e+16
The RSS for  10 th degree polynomial:  2.475699114375292e+17
The RSS for  11 th degree polynomial:  1.193782560132884e+18
The RSS for  12 th degree polynomial:  5.092665343583419e+18
The RSS for  13 th degree polynomial:  7.616230021284306e+17
The RSS for  14 th degree polynomial:  2.2975609250061896e+18
The RSS for  15 th degree polynomial:  6.955038097253948e+18

In [359]:
print ('The minimum RSS: ', min, ' for the ', minPower,'th degree polynomial')


The minimum RSS:  566268593930554.5  for the  6 th degree polynomial

Testing the optimal model on the test set


In [360]:
name = 'power_' + str(minPower)
print("Test RSS: %.2f" % ((optimalModel.predict(poly_data_test[index[0:minPower]]) - poly_data_test['price']) ** 2).sum())


Test RSS: 135225114656218.83

In [ ]: